Metagenome Analysis

Only ancestral, streptomycin and control treatments are included, unless otherwise stated.

Authors

Niina Smolander

Shane Hogle

Libraries and functions

Prior to this analysis, the variants have been filtered using Mutect2’s FilterMutectCalls as well as separately for SNPs and INDELs using the following hard filtering factors: SOR, FS, MQ, MQRankSum, ReadPosRankSum, SNP/INDEL clustering (3/35) and SNP/INDEL proximity.

Code
library(tidyverse)
library(Polychrome)
library(withr)
library(fs)
library(compositions)
library(vegan)
library(ggVennDiagram)
library(qvalue)

source("data/input/utils_parallelism.R")

1) Parallel gene variant frequencies

Nonsynonymous variant allele frequencies of the significantly parallel genes (see the table in the section parallelism, inspecting individual genes (2.3)). Gene names used where available, otherwise prokka annotations or descriptions from eggnog are used in the title. Low abundance strains (from the 16S amplicon data) indicated with stars (* if the abundance < 0.05; ** if < 0.01). Heatmaps only include parallel genes, hence if a gene was found significantly parallel in the t0 but not in t3, the panel only contains variants of that gene for t0.

Code
species_counts_md <- read.delim("data/input/species_counts_md.tsv")

counts <- species_counts_md %>% filter(day == 0 | day == 84) %>%
  group_by(measure_env, evolution_env, replicate, day) %>%
  mutate(read_sum = sum(count_correct)) %>%
  ungroup() %>%
  mutate(proportion = count_correct/read_sum)


par_genes <- read_tsv("data/results/parallel_genes_incl_t0.tsv")

variants_strep <- read_rds("data/input/variants_strep.rds")

variants_par <- inner_join(variants_strep, par_genes, by = join_by(Home, Measure, GENE == "locus_tag")) %>%
  mutate(gene_label = ifelse(Preferred_name == "-" | is.na(Preferred_name), prokka_annotation, Preferred_name)) %>%
  mutate(gene_label = ifelse(gene_label == "hypothetical protein", Description, gene_label)) %>%
  mutate(gene_label = ifelse(gene_label == "-" | is.na(gene_label), GENE, gene_label)) %>%
  select(HAMBI, GENE, CHROM, POS, ALT, VAF, home_env, measure_env, replicate, gene_label) %>%
  group_by(GENE, home_env, measure_env) %>%
  mutate(count_repl = n_distinct(replicate)) %>%
  ungroup() %>%
  mutate(home_repl = paste(home_env, replicate, sep = "___")) 


variants_par_t3 <- variants_par %>%
  filter(measure_env != "t0") 

variants_par_t0 <- variants_par %>%
  filter(measure_env == "t0",
         home_repl %in% variants_par_t3$home_repl)
  
genes_par <- bind_rows(variants_par_t3, variants_par_t0) %>%
  mutate(var = paste0(GENE, "_", POS, "_", ALT, " in replicate ", replicate),
         timepoint = ifelse(measure_env == "t0", "t0", "t3"),
         name = ifelse(home_env == "Home: Anc",
                       paste("ANC", measure_env, sep = " -> \n"),
                       ifelse(timepoint == "t0",
                              "t0",
                              paste(measure_env, sep = "_"))),
         name = factor(name, levels = c("t0", "Meas: B", "Meas: BS", "ANC -> \nMeas: B", "ANC -> \nMeas: BS"))) %>%
  mutate(measure_env2 = ifelse(measure_env == "t0", "none",
                              ifelse(measure_env == "Meas: BS", "bact_strep", "bact")),
         home_env2 = ifelse(home_env == "Home: B", "bact",
                           ifelse(home_env == "Home: BS", "bact_strep",
                                  "anc")))
counts <- counts %>%
  mutate(day = ifelse(day == 0, "t0",
                      ifelse(day == 84, "t3", day)))


genes_par <- left_join(genes_par, select(counts, proportion, count_correct, day, strainID, replicate, evolution_env, measure_env),
          by = join_by(home_env2 == evolution_env, measure_env2 == measure_env, replicate, HAMBI == strainID)) %>%
  mutate(low_ab = ifelse(proportion >= 0.05, "",
                        ifelse(proportion <= 0.01, "**", "*")))

1.1) Home: B and Home: ANC

Aims: (1) Do similar variants appear in Home: B and Home: ANC, especially in response to the streptomycin treatment (“Meas: BS” and “ANC -> Meas: BS”, on the x-axis). For example variants in HAMBI_1977 rsmG appear in both Home: B and Home: ANC when introduced to streptomycin. (2) What happens to the variants that were in parallel genes in t0. (3) Do new variants appear in t3.

Code
genes_par %>%
  filter(home_env == "Home: B" | home_env == "Home: Anc") %>%
  ggplot(aes(y = var, x = name, fill = VAF)) +
  geom_tile() +
  geom_text(aes(y = var, x = name, label = low_ab), color = "white") +
  facet_wrap(HAMBI ~ gene_label, scales = "free", ncol = 3, labeller = label_wrap_gen()) +
  scale_fill_gradient(low = "darkblue", high = "hotpink")

Code
  #theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0))

1.2) Home: BS

Aims: (1) What happens to the variants in t0 parallel genes if the stress is removed (Meas: B) versus if the stress remains (Meas: BS). (2) Do new variants appear in the t3 parallel genes.

Code
genes_par %>%
  filter(home_env == "Home: BS") %>%
  ggplot(aes(y = var, x = name, fill = VAF)) +
  geom_tile() +
  geom_text(aes(y = var, x = name, label = low_ab), color = "white") +
  facet_wrap(HAMBI ~ gene_label, scales = "free", ncol = 3, labeller = label_wrap_gen()) +
  scale_fill_gradient(low = "darkblue", high = "hotpink")

2) Parallel genes distribution

How are parallel genes distributed across different home and measurement conditions?

Code
venn_prep <- genes_par %>%
  mutate(name = paste(home_env, measure_env, sep = " ; ")) %>%
  select(GENE, name) %>%
  distinct() 

venn_prep <- split(venn_prep, venn_prep$name) %>%
  map(\(x) unique(x$GENE))


heatmap_prep <- genes_par %>%
  mutate(name = paste(home_env, measure_env, sep = " ; ")) %>%
  select(GENE, name) %>%
  distinct() %>%
  mutate(value = 1) %>%
  pivot_wider(names_from = "GENE", values_from = "value", values_fill = 0) %>%
  column_to_rownames(var = "name")
x <- as.matrix(heatmap_prep)

p <- ggVennDiagram(venn_prep, order.set.by = c("name"))

venn_table <- p$plotlist[[2]]$data %>%
  arrange(id) %>%
  select(name, item) %>%
  mutate(item = str_remove_all(item, "c\\(|\\)")) %>%
  separate_longer_delim(item, ",") %>%
  mutate(item = str_trim(item),
         item = str_remove_all(item, "\""),
         item = str_remove_all(item, "character\\(0")) %>%
  left_join(distinct(select(variants_par, GENE, gene_label)), by = join_by(item == GENE)) %>%
  mutate(gene_label = paste(item, gene_label)) %>%
  select(-item) %>%
  group_by(name) %>%
  mutate(number = 1:n()) %>%
  pivot_wider(names_from = name, values_from = gene_label) %>%
  select(-number)

2.1) Heatmap

Code
heatmap(x,
        distfun = function(x) vegan::vegdist(x,  method="jaccard", binary=TRUE),
        hclustfun = function(x) hclust(x, method="complete"),
        scale="none", col = c("white", "black"))

2.2) UpSet plot

Instead of a Venn diagram, using UpSet plot to visualise how many parallel genes were in different home+measurement conditions. (Pretty good explanation on how these plots work can be found at https://upset.app/). The shared parallel genes are listed in the table below and should be in the same order as the bars / intersections.

Code
p

Code
rmarkdown::paged_table(venn_table)